configfile: "config/revision1.yaml"

# singularity: "docker://alzhang/scrna-analysis-follicular-3.5:v1.7"
singularity: "docker://alzhang/spectrum-master-rstudio:v1.4"

rule all:
    input:
        follicular_cellassign_annotated='{workdir}/sce_follicular_celltype_annotated.rds'.format(workdir=config['workdir']),
        tcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_tcells.rds'.format(outdir=config['outdir']),
        bcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_bcells.rds'.format(outdir=config['outdir']),
        de_tables_reactome=expand('{outdir}/differential_expression/ReactomePA/timepoint/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']), celltype_group=config['differential_expression_settings']['ReactomePA']['celltype_groups'], patient_group=config['differential_expression_settings']['ReactomePA']['patient_groups']),
        de_res=expand('{outdir}/differential_expression/ReactomePA/malignant_timepoint/b_cells/{{patient_group}}.rds'.format(outdir=config['outdir']), patient_group=config['differential_expression_settings']['ReactomePA']['patient_groups']),
        de_tables_fgsea=expand('{outdir}/differential_expression/fgsea/timepoint/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']), celltype_group=config['differential_expression_settings']['fgsea']['celltype_groups'], patient_group=config['differential_expression_settings']['fgsea']['patient_groups']),
        follicular_scvis_train='{workdir}/sce_follicular_annotated_final_scvis_train.rds'.format(workdir=config['workdir']),
        follicular_RLN_merged=ancient('{workdir}/sce_follicular_RLN_merged.rds'.format(workdir=config['workdir'])),
        lk_assignments='{outdir}/lambda_kappa_assignments.rds'.format(outdir=config['outdir']),
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),


# Create SingleCellExperiment
rule create_follicular_sce:
    input:
        filtered_matrices=[config['follicular_data'][x]['filtered_matrix_path'] for x in config['follicular_data']],
    output:
        '{workdir}/sce_follicular.rds'.format(workdir=config['workdir'])
    params:
        name='create-follicular-sce',
        sample_names=[config['follicular_data'][x]['dataset'] for x in config['follicular_data']],
        timepoints=[config['follicular_data'][x]['timepoint'] for x in config['follicular_data']],
        progression_statuses=[config['follicular_data'][x]['progression_status'] for x in config['follicular_data']],
        patient_progression_statuses=[config['follicular_data'][x]['patient_progression_status'] for x in config['follicular_data']],
        patients=[config['follicular_data'][x]['patient'] for x in config['follicular_data']],
    log:
        '{logdir}/logs/create_follicular_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/create_follicular_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/create_sce.R '
        '--sample_names {params.sample_names} '
        '--filtered_matrices {input.filtered_matrices} '
        '--timepoints {params.timepoints} '
        '--progression {params.progression_statuses} '
        '--patient_progression {params.patient_progression_statuses} '
        '--patients {params.patients} '
        '--outfname {output} '
        '>& {log}'


# Filter and normalize SingleCellExperiment
rule preprocess_follicular_sce:
    input:
        follicular_raw='{workdir}/sce_follicular.rds'.format(workdir=config['workdir']),
    output:
        follicular_normalized='{workdir}/sce_follicular_normalized.rds'.format(workdir=config['workdir']),
    params:
        name='preprocess-follicular-sce',
        mito_thres=config['filter_settings']['mito_thres'],
        ribo_thres=config['filter_settings']['ribo_thres'],
    log:
        '{logdir}/logs/preprocess_follicular_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/preprocess_follicular_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/preprocess_sce.R '
        '--sce {input.follicular_raw} '
        '--mito_thres {params.mito_thres} '
        '--ribo_thres {params.ribo_thres} '
        '--outfname {output} '
        '>& {log}'

# Cell cycle assignments (TODO: Run this in multi-core mode)
rule cyclone_follicular_sce:
    input:
        follicular_normalized='{workdir}/sce_follicular_normalized.rds'.format(workdir=config['workdir']),
    output:
        cc_result='{outdir}/cyclone/cc_result.rds'.format(outdir=config['outdir']),
    params:
        name='cyclone-follicular-sce',
    log:
        '{logdir}/logs/cyclone_follicular_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/cyclone_follicular_sce.txt'.format(logdir=config['logdir'])
    threads: 15
    shell:
        'Rscript R/cyclone_sce.R '
        '--sce {input.follicular_normalized} '
        '--ncpus {threads} '
        '--outfname {output} '
        '>& {log}'

# Assign cells to B or T classes
rule cellassign_follicular_small:
    input:
        follicular_normalized='{workdir}/sce_follicular_normalized.rds'.format(workdir=config['workdir']),
        marker_list_small=config['marker_lists']['small']['path'],
    output:
        broad_assignments='{outdir}/broad_assignments.rds'.format(outdir=config['outdir']),
    params:
        name='cellassign-follicular-small',
        include_other=config['marker_lists']['small']['include_other'],
        num_runs=config['cellassign_settings']['num_runs'],
        B=config['cellassign_settings']['B'],
        extra_opts=config['cellassign_settings']['extra_opts'],
    threads: 15
    log:
        '{logdir}/logs/cellassign_follicular_small.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/cellassign_follicular_small.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/cellassign_sce.R '
        '--sce {input.follicular_normalized} '
        '--marker_list {input.marker_list_small} '
        '--include_other {params.include_other} '
        '--num_runs {params.num_runs} '
        '--rbf_pieces {params.B} '
        '{params.extra_opts} '
        '--outfname {output.broad_assignments} '
        '>& {log}'

# Assign cells to specific subtypes, including cytotoxic T
rule cellassign_follicular_big:
    input:
        follicular_normalized='{workdir}/sce_follicular_normalized.rds'.format(workdir=config['workdir']),
        marker_list_big=config['marker_lists']['big']['path'],
    output:
        specific_assignments='{outdir}/specific_assignments.rds'.format(outdir=config['outdir']),
    params:
        name='cellassign-follicular-big',
        include_other=config['marker_lists']['big']['include_other'],
        num_runs=config['cellassign_settings']['num_runs'],
        B=config['cellassign_settings']['B'],
        extra_opts=config['cellassign_settings']['extra_opts'],
    threads: 15
    log:
        '{logdir}/logs/cellassign_follicular_big.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/cellassign_follicular_big.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/cellassign_sce.R '
        '--sce {input.follicular_normalized} '
        '--marker_list {input.marker_list_big} '
        '--include_other {params.include_other} '
        '--num_runs {params.num_runs} '
        '--rbf_pieces {params.B} '
        '{params.extra_opts} '
        '--outfname {output.specific_assignments} '
        '>& {log}'


# Annotate follicular SCE with assignments
rule annotate_cellassign:
    input:
        follicular_normalized='{workdir}/sce_follicular_normalized.rds'.format(workdir=config['workdir']),
        broad_assignments='{outdir}/broad_assignments.rds'.format(outdir=config['outdir']),
        specific_assignments='{outdir}/specific_assignments.rds'.format(outdir=config['outdir']),
    output:
        follicular_cellassign_annotated='{workdir}/sce_follicular_celltype_annotated.rds'.format(workdir=config['workdir']),
    params:
        name='annotate-cellassign',
    log:
        '{logdir}/logs/annotate_cellassign.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/annotate_cellassign.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/annotate_cellassign.R '
        '--sce {input.follicular_normalized} '
        '--broad {input.broad_assignments} '
        '--specific {input.specific_assignments} '
        '--outfname {output.follicular_cellassign_annotated} '
        '>& {log}'

# Call individual B cells as malignant/nonmalignant
# NEEDS TO BE EXAMINED EVERY TIME ITS INPUTS CHANGE
rule determine_malignant_status:
    input:
        follicular_cellassign_annotated='{workdir}/sce_follicular_celltype_annotated.rds'.format(workdir=config['workdir']),
    output:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
    params:
        name='determine-malignant-status',
    log:
        '{logdir}/logs/determine_malignant_status.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/determine_malignant_status.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/determine_malignant_status.R '
        '--sce {input.follicular_cellassign_annotated} '
        '--outfname {output.follicular_malignant_annotated} '
        '>& {log}'

# Unsupervised clustering of T cells
rule follicular_unsupervised_t:
    input:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
    output:
        unsupervised_t_assignments='{outdir}/unsupervised_t_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='follicular-unsupervised-t',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['follicular_t']['methods'],
        cluster_use_method=config['cluster_settings']['follicular_t']['use_method'],
        celltypes=config['cluster_settings']['follicular_t']['celltypes'],
    log:
        '{logdir}/logs/follicular_unsupervised_t.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/follicular_unsupervised_t.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.follicular_malignant_annotated} '
        '--celltypes {params.celltypes} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_t_assignments} '
        '>& {log}'

# Unsupervised clustering of malignant B cells
rule follicular_unsupervised_malignant:
    input:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
    output:
        unsupervised_malignant_assignments='{outdir}/unsupervised_malignant_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='follicular-unsupervised-malignant',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['follicular_malignant']['methods'],
        cluster_use_method=config['cluster_settings']['follicular_malignant']['use_method'],
        celltypes=config['cluster_settings']['follicular_malignant']['celltypes'],
    log:
        '{logdir}/logs/follicular_unsupervised_malignant.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/follicular_unsupervised_malignant.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.follicular_malignant_annotated} '
        '--celltypes {params.celltypes} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_malignant_assignments} '
        '>& {log}'

# Unsupervised clustering of nonmalignant B cells
rule follicular_unsupervised_b:
    input:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
    output:
        unsupervised_b_assignments='{outdir}/unsupervised_b_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='follicular-unsupervised-b',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['follicular_b']['methods'],
        cluster_use_method=config['cluster_settings']['follicular_b']['use_method'],
        celltypes=config['cluster_settings']['follicular_b']['celltypes'],
    log:
        '{logdir}/logs/follicular_unsupervised_b.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/follicular_unsupervised_b.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.follicular_malignant_annotated} '
        '--celltypes {params.celltypes} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_b_assignments} '
        '>& {log}'


# Unsupervised clustering of all cells
rule follicular_unsupervised_all:
    input:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
    output:
        unsupervised_all_assignments='{outdir}/unsupervised_all_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='follicular-unsupervised-all',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['follicular_all']['methods'],
        cluster_use_method=config['cluster_settings']['follicular_all']['use_method'],
    log:
        '{logdir}/logs/follicular_unsupervised_all.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/follicular_unsupervised_all.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.follicular_malignant_annotated} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_all_assignments} '
        '>& {log}'

# Unsupervised clustering of all cells, using a subset of genes (marker genes)
rule follicular_unsupervised_all_subset:
    input:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
        marker_list_big=config['marker_lists']['big']['path'],
    output:
        unsupervised_all_subset_assignments='{outdir}/unsupervised_all_subset_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='follicular-unsupervised-all-subset',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['follicular_all_subset']['methods'],
        cluster_use_method=config['cluster_settings']['follicular_all_subset']['use_method'],
    log:
        '{logdir}/logs/follicular_unsupervised_all_subset.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/follicular_unsupervised_all_subset.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.follicular_malignant_annotated} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--marker_list {input.marker_list_big} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_all_subset_assignments} '
        '>& {log}'

# Annotate follicular SCE with assignments
rule annotate_follicular_final:
    input:
        follicular_malignant_annotated='{workdir}/sce_follicular_malignant_annotated.rds'.format(workdir=config['workdir']),
        unsupervised_t_assignments='{outdir}/unsupervised_t_assignments.tsv'.format(outdir=config['outdir']),
        unsupervised_malignant_assignments='{outdir}/unsupervised_malignant_assignments.tsv'.format(outdir=config['outdir']),
        unsupervised_b_assignments='{outdir}/unsupervised_b_assignments.tsv'.format(outdir=config['outdir']),
        unsupervised_all_assignments='{outdir}/unsupervised_all_assignments.tsv'.format(outdir=config['outdir']),
        unsupervised_all_subset_assignments='{outdir}/unsupervised_all_subset_assignments.tsv'.format(outdir=config['outdir']),
        cc_result='{outdir}/cyclone/cc_result.rds'.format(outdir=config['outdir']),
    output:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    params:
        name='annotate-follicular-final',
    log:
        '{logdir}/logs/annotate_follicular_final.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/annotate_follicular_final.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/annotate_follicular_final.R '
        '--sce {input.follicular_malignant_annotated} '
        '--unsupervised_t {input.unsupervised_t_assignments} '
        '--unsupervised_malignant {input.unsupervised_malignant_assignments} '
        '--unsupervised_b {input.unsupervised_b_assignments} '
        '--unsupervised_all {input.unsupervised_all_assignments} '
        '--unsupervised_all_subset {input.unsupervised_all_subset_assignments} '
        '--cyclone {input.cc_result} '
        '--outfname {output.follicular_annotated} '
        '>& {log}'

rule differential_expression_timepoint_by_celltype:
    input:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        de_tables='{outdir}/differential_expression/ReactomePA/timepoint/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']),
    params:
        celltype_group=lambda wildcards: config['differential_expression_settings']['ReactomePA']['celltype_groups'][wildcards.celltype_group],
        patient_group=lambda wildcards: config['differential_expression_settings']['ReactomePA']['patient_groups'][wildcards.patient_group],
        de_method=config['differential_expression_settings']['ReactomePA']['gene_method'],
        num_genes=config['differential_expression_settings']['ReactomePA']['num_genes'],
        min_gene_counts=config['differential_expression_settings']['ReactomePA']['min_gene_counts'],
        name='differential-expression-reactome-timepoint-{celltype_group}-{patient_group}',
    log:
        '{logdir}/logs/differential_expression_timepoint/ReactomePA/{{celltype_group}}/{{patient_group}}.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/differential_expression_timepoint/ReactomePA/{{celltype_group}}/{{patient_group}}.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/de_timepoint.R '
        '--sce {input.follicular_annotated} '
        '--celltypes {params.celltype_group} '
        '--patients {params.patient_group} '
        '--method_gene {params.de_method} '
        '--method_pathway ReactomePA '
        '--ngene {params.num_genes} '
        '--outfname {output.de_tables} '
        '>& {log}'

rule differential_expression_bcell_malignant_timepoint:
    input:
        bcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_bcells.rds'.format(outdir=config['outdir']),
    output:
        de_res='{outdir}/differential_expression/ReactomePA/malignant_timepoint/b_cells/{{patient_group}}.rds'.format(outdir=config['outdir']),
    params:
        min_gene_counts=config['differential_expression_settings']['ReactomePA']['min_gene_counts'],
        patient_group=lambda wildcards: config['differential_expression_settings']['ReactomePA']['patient_groups'][wildcards.patient_group],
        num_genes=config['differential_expression_settings']['ReactomePA']['num_genes'],
        name='differential-expression-reactome-bcell-malignant-timepoint-{patient_group}',
        bcell_labels=config['b_cell_labels'],
    log:
        '{logdir}/logs/differential_expression_bcell_malignant_timepoint/{{patient_group}}.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/differential_expression_bcell_malignant_timepoint/{{patient_group}}.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/de_b_cells_malignant_timepoint.R '
        '--sce {input.bcells_annotated} '
        '--min_gene_counts {params.min_gene_counts} '
        '--patients {params.patient_group} '
        '--bcell_labels {params.bcell_labels} '
        '--ngene {params.num_genes} '
        '--outfname {output.de_res} '
        '>& {log}'

rule differential_expression_fgsea_timepoint:
    input:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        de_tables='{outdir}/differential_expression/fgsea/timepoint/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']),
    params:
        celltype_group=lambda wildcards: config['differential_expression_settings']['fgsea']['celltype_groups'][wildcards.celltype_group],
        patient_group=lambda wildcards: config['differential_expression_settings']['fgsea']['patient_groups'][wildcards.patient_group],
        de_method=config['differential_expression_settings']['fgsea']['gene_method'],
        min_gene_counts=config['differential_expression_settings']['fgsea']['min_gene_counts'],
        gene_set_file=config['gene_sets']['hallmark'],
        name='differential-expression-timepoint-fgsea-{celltype_group}-{patient_group}',
    log:
        '{logdir}/logs/differential_expression_timepoint/fgsea/{{celltype_group}}/{{patient_group}}.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/differential_expression_timepoint/fgsea/{{celltype_group}}/{{patient_group}}.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/de_timepoint.R '
        '--sce {input.follicular_annotated} '
        '--celltypes {params.celltype_group} '
        '--patients {params.patient_group} '
        '--method_gene {params.de_method} '
        '--method_pathway fgsea '
        '--gene_set_file {params.gene_set_file} '
        '--outfname {output.de_tables} '
        '>& {log}'

rule filter_tcells:
    input:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        tcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_tcells.rds'.format(outdir=config['outdir']),
    params:
        name='filter-tcells',
        t_cell_probability=config['celltype_filters']['broad_t_cell_probability'],
    log: 
        '{logdir}/logs/filter_subset/filter_tcells.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/filter_subset/filter_tcells.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/filter_cells.R '
        '--sce {input.follicular_annotated} '
        '--celltype T\ cell '
        '--malignancy_filter nonmalignant '
        '--celltype_probability {params.t_cell_probability} '
        '--outfname {output.tcells_annotated} '
        '>& {log}'

rule filter_bcells:
    input:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        bcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_bcells.rds'.format(outdir=config['outdir']),
    params:
        name='filter-bcells',
        b_cell_probability=config['celltype_filters']['broad_b_cell_probability'],
    log: 
        '{logdir}/logs/filter_subset/filter_bcells.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/filter_subset/filter_bcells.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/filter_cells.R '
        '--sce {input.follicular_annotated} '
        '--celltype B\ cell '
        '--malignancy_filter all '
        '--celltype_probability {params.b_cell_probability} '
        '--outfname {output.bcells_annotated} '
        '>& {log}'


rule filter_nonmalignant_bcells:
    input:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        nonmalignant_bcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_nonmalignant_bcells.rds'.format(outdir=config['outdir']),
    params:
        name='filter-bcells',
        b_cell_probability=config['celltype_filters']['broad_b_cell_probability'],
        nonmalignant_b_cell_categories=config['celltype_full_groups']['nonmalignant_b'],
    log: 
        '{logdir}/logs/filter_subset/filter_nonmalignant_bcells.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/filter_subset/filter_nonmalignant_bcells.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/filter_cells.R '
        '--sce {input.follicular_annotated} '
        '--celltype B\ cell '
        '--celltype_full {params.nonmalignant_b_cell_categories} '
        '--malignancy_filter nonmalignant '
        '--celltype_probability {params.b_cell_probability} '
        '--outfname {output.nonmalignant_bcells_annotated} '
        '>& {log}'

# Assign nonmalignant B/T cells to lambda/kappa
rule lambda_kappa_classification:
    input:
        nonmalignant_bcells_annotated='{outdir}/sce_annotated_subset/sce_follicular_nonmalignant_bcells.rds'.format(outdir=config['outdir']),
        marker_list_lk=config['marker_lists']['lambdakappa']['path'],
    output:
        lk_assignments='{outdir}/lambda_kappa_assignments.rds'.format(outdir=config['outdir']),
    params:
        name='lambda-kappa-classification',
        include_other=config['marker_lists']['lambdakappa']['include_other'],
        num_runs=config['cellassign_settings']['num_runs'],
        B=config['cellassign_settings']['B'],
        extra_opts=config['cellassign_settings']['extra_opts'],
    threads: 15
    log:
        '{logdir}/logs/lambda_kappa_classification.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/lambda_kappa_classification.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/cellassign_sce.R '
        '--sce {input.nonmalignant_bcells_annotated} '
        '--marker_list {input.marker_list_lk} '
        '--include_other {params.include_other} '
        '--num_runs {params.num_runs} '
        '--rbf_pieces {params.B} '
        '{params.extra_opts} '
        '--outfname {output.lk_assignments} '
        '>& {log}'

### RLN processing

# Create SingleCellExperiment
rule create_rln_sce:
    input:
        filtered_matrices=[config['RLN_data'][x]['filtered_matrix_path'] for x in config['RLN_data']],
    output:
        '{workdir}/sce_RLN.rds'.format(workdir=config['workdir'])
    params:
        name='create-rln-sce',
        sample_names=[config['RLN_data'][x]['dataset'] for x in config['RLN_data']],
        timepoints=[config['RLN_data'][x]['timepoint'] for x in config['RLN_data']],
        progression_statuses=[config['RLN_data'][x]['progression_status'] for x in config['RLN_data']],
        patient_progression_statuses=[config['RLN_data'][x]['patient_progression_status'] for x in config['RLN_data']],
        patients=[config['RLN_data'][x]['patient'] for x in config['RLN_data']],
    log:
        '{logdir}/logs/create_rln_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/create_rln_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/create_sce.R '
        '--sample_names {params.sample_names} '
        '--filtered_matrices {input.filtered_matrices} '
        '--timepoints {params.timepoints} '
        '--progression {params.progression_statuses} '
        '--patient_progression {params.patient_progression_statuses} '
        '--patients {params.patients} '
        '--outfname {output} '
        '>& {log}'


# Filter and normalize SingleCellExperiment
rule preprocess_rln_sce:
    input:
        RLN_raw='{workdir}/sce_RLN.rds'.format(workdir=config['workdir']),
    output:
        RLN_normalized='{workdir}/sce_RLN_normalized.rds'.format(workdir=config['workdir']),
    params:
        name='preprocess-rln-sce',
        mito_thres=config['filter_settings']['mito_thres'],
        ribo_thres=config['filter_settings']['ribo_thres'],
    log:
        '{logdir}/logs/preprocess_rln_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/preprocess_rln_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/preprocess_sce.R '
        '--sce {input.RLN_raw} '
        '--mito_thres {params.mito_thres} '
        '--ribo_thres {params.ribo_thres} '
        '--outfname {output.RLN_normalized} '
        '>& {log}'

rule batch_correct:
    input:
        follicular_annotated='{outdir}/sce_follicular_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        follicular_bc='{workdir}/sce_follicular_annotated_final_bc.rds'.format(workdir=config['workdir']),
    params:
        name='batch-correct',
        batch_col=config['batch_correction_settings']['batch_variable'],
        method=config['batch_correction_settings']['method'],
        conda_env='r-tensorflow',
    log:
        '{logdir}/logs/batch_correct.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/batch_correct.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/batch_correct.R '
        '--sce {input.follicular_annotated} '
        '--conda_env {params.conda_env} '
        '--method {params.method} '
        '--batch_column {params.batch_col} '
        '--outfname {output.follicular_bc} '
        '>& {log}'

rule scvis_follicular_train:
    input:
        follicular_bc='{workdir}/sce_follicular_annotated_final_bc.rds'.format(workdir=config['workdir']),
    output:
        follicular_scvis_train='{workdir}/sce_follicular_annotated_final_scvis_train.rds'.format(workdir=config['workdir']),
        scvis_output_dir=directory('{workdir}/scvis/train_follicular'.format(workdir=config['workdir'])),
    params:
        name='scvis-follicular-train',
        config_path=config['scvis']['default_config'],
        dimname=config['scvis']['dimname'],
        conda_env='r-tensorflow',
    log:
        '{logdir}/logs/scvis_follicular_train.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/scvis_follicular_train.txt'.format(logdir=config['logdir'])
    threads: 15
    shell:
        'Rscript R/scvis_train.R '
        '--sce {input.follicular_bc} '
        '--conda_env {params.conda_env} '
        '--scvis_config_path {params.config_path} '
        '--scvis_dimname {params.dimname} '
        '--scvis_output_dir {output.scvis_output_dir} '
        '--outfname {output.follicular_scvis_train} '
        '>& {log}'

rule scvis_RLN_map:
    input:
        follicular_scvis_train='{workdir}/sce_follicular_annotated_final_scvis_train.rds'.format(workdir=config['workdir']),
        RLN_normalized='{workdir}/sce_RLN_normalized.rds'.format(workdir=config['workdir']),
        scvis_output_dir='{workdir}/scvis/train_follicular'.format(workdir=config['workdir']),
    output:
        RLN_mapped='{workdir}/sce_RLN_mapped.rds'.format(workdir=config['workdir']),
        follicular_RLN_merged='{workdir}/sce_follicular_RLN_merged.rds'.format(workdir=config['workdir']),
    params:
        name='scvis-RLN-map',
        config_path=config['scvis']['default_config'],
        conda_env='r-tensorflow',
    log:
        '{logdir}/logs/scvis_RLN_map.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/scvis_RLN_map.txt'.format(logdir=config['logdir'])
    threads: 15
    shell:
        'Rscript R/scvis_map.R '
        '--sce {input.follicular_scvis_train} '
        '--sce_rln {input.RLN_normalized} '
        '--conda_env {params.conda_env} '
        '--scvis_config_path {params.config_path} '
        '--scvis_output_dir {input.scvis_output_dir} '
        '--out_rln {output.RLN_mapped} '
        '--out_merged {output.follicular_RLN_merged} '
        '>& {log}'
